In [38]:
import numpy as np
# Fix the global NumPy RNG seed so any NumPy-based sampling below is reproducible
np.random.seed(7)
In [39]:
import tensorflow as tf
# Record the TensorFlow version this notebook was executed with (for provenance)
tf.__version__
Out[39]:
'2.8.0'
In [40]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import random, re
import time

# used to supress display of warnings
import warnings

import missingno as mno

# nlp libraries
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from tqdm import tqdm
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from sklearn.feature_extraction.text import TfidfVectorizer

import holoviews as hv
from holoviews import opts

import os;
from os import makedirs

# sampling methods
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

# import zscore for scaling the data
from scipy.stats import zscore

from scipy.stats import randint as sp_randint

# save models
import pickle

# pre-processing methods
from sklearn.model_selection import train_test_split
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [41]:
# the classification models 
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# ensemble models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# methods and classes for evaluation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, precision_score, roc_auc_score

# cross-validation methods
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# feature selection methods
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV

# pre-processing methods
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import LabelEncoder

## for bag-of-words
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing
## for explainer

## for word embedding
import gensim
import gensim.downloader as gensim_api
## for deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K
In [42]:
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): absolute Colab/Drive path — the notebook only runs with this exact
# Drive layout; consider a configurable DATA_DIR. TODO confirm file location.
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Capstone/IHMStefanini_industrial_safety_and_health_database_with_accidents_description.csv")
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
In [42]:

In [43]:
# Preview the first 5 rows of the raw dataset
data.head()
Out[43]:
Unnamed: 0 Data Countries Local Industry Sector Accident Level Potential Accident Level Genre Employee or Third Party Critical Risk Description
0 0 2016-01-01 00:00:00 Country_01 Local_01 Mining I IV Male Third Party Pressed While removing the drill rod of the Jumbo 08 f...
1 1 2016-01-02 00:00:00 Country_02 Local_02 Mining I IV Male Employee Pressurized Systems During the activation of a sodium sulphide pum...
2 2 2016-01-06 00:00:00 Country_01 Local_03 Mining I III Male Third Party (Remote) Manual Tools In the sub-station MILPO located at level +170...
3 3 2016-01-08 00:00:00 Country_01 Local_04 Mining I I Male Third Party Others Being 9:45 am. approximately in the Nv. 1880 C...
4 4 2016-01-10 00:00:00 Country_01 Local_04 Mining IV IV Male Third Party Others Approximately at 11:45 a.m. in circumstances t...
In [44]:
# Column dtypes, non-null counts and approximate memory footprint
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425 entries, 0 to 424
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Unnamed: 0                425 non-null    int64 
 1   Data                      425 non-null    object
 2   Countries                 425 non-null    object
 3   Local                     425 non-null    object
 4   Industry Sector           425 non-null    object
 5   Accident Level            425 non-null    object
 6   Potential Accident Level  425 non-null    object
 7   Genre                     425 non-null    object
 8   Employee or Third Party   425 non-null    object
 9   Critical Risk             425 non-null    object
 10  Description               425 non-null    object
dtypes: int64(1), object(10)
memory usage: 36.6+ KB
In [45]:
#Shape of the data: report row and column counts of the raw frame
n_rows, n_cols = data.shape
print(f"Number of rows = {n_rows} and Number of Columns = {n_cols} in the Data frame")
Number of rows = 425 and Number of Columns = 11 in the Data frame
  • From the above output, we see that, except for the first column, every column's datatype is object.
  • Categorical columns - 'Countries', 'Local', 'Industry Sector', 'Accident Level', 'Potential Accident Level', 'Genre', 'Employee or Third Party', 'Critical Risk', 'Description'
  • Date column - 'Data'

  • There are about 425 rows and 11 columns in the dataset.

  • We noticed that except a 'date' column all other columns are categorical columns.
In [46]:
#Data Cleansing
#Drop the redundant 'Unnamed: 0' index column and rename the awkwardly named
#columns in one chained expression. Re-assignment is used instead of
#inplace=True: it has no performance benefit, kills chaining, and in-place
#mutation is a classic source of hidden-state bugs on notebook re-runs.
data = (
    data
    .drop(columns="Unnamed: 0")
    .rename(columns={'Data': 'Date',
                     'Countries': 'Country',
                     'Genre': 'Gender',
                     'Employee or Third Party': 'Employee type'})
)

#Get the top 3 rows to verify the new column names
data.head(3)
Out[46]:
Date Country Local Industry Sector Accident Level Potential Accident Level Gender Employee type Critical Risk Description
0 2016-01-01 00:00:00 Country_01 Local_01 Mining I IV Male Third Party Pressed While removing the drill rod of the Jumbo 08 f...
1 2016-01-02 00:00:00 Country_02 Local_02 Mining I IV Male Employee Pressurized Systems During the activation of a sodium sulphide pum...
2 2016-01-06 00:00:00 Country_01 Local_03 Mining I III Male Third Party (Remote) Manual Tools In the sub-station MILPO located at level +170...
In [47]:
# Check duplicates in a data frame
# duplicated() flags a row True when an identical row occurred earlier;
# summing the boolean mask yields the number of duplicate rows.
data.duplicated().sum()
Out[47]:
7
In [48]:
#View the duplicate records (each is an exact repeat of an earlier row)
duplicates = data.duplicated()

# Select the flagged rows via .loc with the boolean mask
data.loc[duplicates]
Out[48]:
Date Country Local Industry Sector Accident Level Potential Accident Level Gender Employee type Critical Risk Description
77 2016-04-01 00:00:00 Country_01 Local_01 Mining I V Male Third Party (Remote) Others In circumstances that two workers of the Abrat...
262 2016-12-01 00:00:00 Country_01 Local_03 Mining I IV Male Employee Others During the activity of chuteo of ore in hopper...
303 2017-01-21 00:00:00 Country_02 Local_02 Mining I I Male Third Party (Remote) Others Employees engaged in the removal of material f...
345 2017-03-02 00:00:00 Country_03 Local_10 Others I I Male Third Party Venomous Animals On 02/03/17 during the soil sampling in the re...
346 2017-03-02 00:00:00 Country_03 Local_10 Others I I Male Third Party Venomous Animals On 02/03/17 during the soil sampling in the re...
355 2017-03-15 00:00:00 Country_03 Local_10 Others I I Male Third Party Venomous Animals Team of the VMS Project performed soil collect...
397 2017-05-23 00:00:00 Country_01 Local_04 Mining I IV Male Third Party Projection of fragments In moments when the 02 collaborators carried o...
In [49]:
#Delete duplicate rows (keeps the first occurrence). Re-assignment instead of
#inplace=True: equivalent result, but explicit and chain-friendly, and avoids
#hidden in-place mutation of a frame earlier cells already displayed.
data = data.drop_duplicates()
In [50]:
# (rows, columns) after removing the 7 duplicate rows
data.shape
Out[50]:
(418, 10)
In [51]:
rows_after, cols_after = data.shape
print(f"Number of rows = {rows_after} and Number of Columns = {cols_after} in the Data frame after removing the duplicates.")
Number of rows = 418 and Number of Columns = 10 in the Data frame after removing the duplicates.
In [52]:
#Check unique values of all columns except 'Description' column
#(Description is free text, so listing its unique values is not informative)
separator = '--' * 30
for col in data.columns:
    if col == 'Description':
        continue
    print(separator)
    print(f'Unique values of "{col}" column')
    print(separator)
    print(data[col].unique())
    print('\n')
------------------------------------------------------------
Unique values of "Date" column
------------------------------------------------------------
['2016-01-01 00:00:00' '2016-01-02 00:00:00' '2016-01-06 00:00:00'
 '2016-01-08 00:00:00' '2016-01-10 00:00:00' '2016-01-12 00:00:00'
 '2016-01-16 00:00:00' '2016-01-17 00:00:00' '2016-01-19 00:00:00'
 '2016-01-26 00:00:00' '2016-01-28 00:00:00' '2016-01-30 00:00:00'
 '2016-02-01 00:00:00' '2016-02-02 00:00:00' '2016-02-04 00:00:00'
 '2016-02-06 00:00:00' '2016-02-07 00:00:00' '2016-02-08 00:00:00'
 '2016-02-21 00:00:00' '2016-02-25 00:00:00' '2016-02-09 00:00:00'
 '2016-02-10 00:00:00' '2016-02-15 00:00:00' '2016-02-14 00:00:00'
 '2016-02-13 00:00:00' '2016-02-16 00:00:00' '2016-02-17 00:00:00'
 '2016-02-19 00:00:00' '2016-02-20 00:00:00' '2016-02-18 00:00:00'
 '2016-02-22 00:00:00' '2016-02-24 00:00:00' '2016-02-29 00:00:00'
 '2016-02-26 00:00:00' '2016-02-27 00:00:00' '2016-03-02 00:00:00'
 '2016-03-03 00:00:00' '2016-03-04 00:00:00' '2016-03-05 00:00:00'
 '2016-03-06 00:00:00' '2016-03-09 00:00:00' '2016-03-11 00:00:00'
 '2016-03-13 00:00:00' '2016-03-12 00:00:00' '2016-03-14 00:00:00'
 '2016-03-16 00:00:00' '2016-03-10 00:00:00' '2016-03-17 00:00:00'
 '2016-03-18 00:00:00' '2016-03-19 00:00:00' '2016-03-22 00:00:00'
 '2016-03-25 00:00:00' '2016-03-30 00:00:00' '2016-03-31 00:00:00'
 '2016-04-01 00:00:00' '2016-04-03 00:00:00' '2016-04-02 00:00:00'
 '2016-03-24 00:00:00' '2016-04-04 00:00:00' '2016-04-05 00:00:00'
 '2016-04-07 00:00:00' '2016-04-08 00:00:00' '2016-04-11 00:00:00'
 '2016-04-14 00:00:00' '2016-04-16 00:00:00' '2016-04-15 00:00:00'
 '2016-04-17 00:00:00' '2016-04-18 00:00:00' '2016-04-21 00:00:00'
 '2016-04-22 00:00:00' '2016-04-23 00:00:00' '2016-04-26 00:00:00'
 '2016-04-28 00:00:00' '2016-04-29 00:00:00' '2016-04-30 00:00:00'
 '2016-05-01 00:00:00' '2016-05-02 00:00:00' '2016-05-04 00:00:00'
 '2016-05-03 00:00:00' '2016-05-05 00:00:00' '2016-05-11 00:00:00'
 '2016-05-12 00:00:00' '2016-05-14 00:00:00' '2016-05-17 00:00:00'
 '2016-05-19 00:00:00' '2016-05-18 00:00:00' '2016-05-22 00:00:00'
 '2016-05-20 00:00:00' '2016-05-24 00:00:00' '2016-05-25 00:00:00'
 '2016-05-27 00:00:00' '2016-05-26 00:00:00' '2016-06-01 00:00:00'
 '2016-06-02 00:00:00' '2016-06-03 00:00:00' '2016-06-04 00:00:00'
 '2016-06-05 00:00:00' '2016-06-08 00:00:00' '2016-06-07 00:00:00'
 '2016-06-10 00:00:00' '2016-06-13 00:00:00' '2016-06-16 00:00:00'
 '2016-06-18 00:00:00' '2016-06-17 00:00:00' '2016-06-19 00:00:00'
 '2016-06-21 00:00:00' '2016-06-22 00:00:00' '2016-06-23 00:00:00'
 '2016-06-24 00:00:00' '2016-06-29 00:00:00' '2016-07-02 00:00:00'
 '2016-07-04 00:00:00' '2016-07-08 00:00:00' '2016-07-07 00:00:00'
 '2016-07-09 00:00:00' '2016-07-10 00:00:00' '2016-07-11 00:00:00'
 '2016-07-14 00:00:00' '2016-07-15 00:00:00' '2016-07-16 00:00:00'
 '2016-07-18 00:00:00' '2016-07-20 00:00:00' '2016-07-21 00:00:00'
 '2016-07-23 00:00:00' '2016-07-27 00:00:00' '2016-07-29 00:00:00'
 '2016-07-30 00:00:00' '2016-08-02 00:00:00' '2016-08-01 00:00:00'
 '2016-08-04 00:00:00' '2016-08-11 00:00:00' '2016-08-12 00:00:00'
 '2016-08-14 00:00:00' '2016-08-15 00:00:00' '2016-08-18 00:00:00'
 '2016-08-19 00:00:00' '2016-08-22 00:00:00' '2016-08-24 00:00:00'
 '2016-08-25 00:00:00' '2016-08-29 00:00:00' '2016-08-27 00:00:00'
 '2016-08-30 00:00:00' '2016-09-01 00:00:00' '2016-09-02 00:00:00'
 '2016-09-04 00:00:00' '2016-09-03 00:00:00' '2016-09-06 00:00:00'
 '2016-09-05 00:00:00' '2016-09-13 00:00:00' '2016-09-12 00:00:00'
 '2016-09-15 00:00:00' '2016-09-17 00:00:00' '2016-09-16 00:00:00'
 '2016-09-20 00:00:00' '2016-09-21 00:00:00' '2016-09-22 00:00:00'
 '2016-09-27 00:00:00' '2016-09-29 00:00:00' '2016-09-30 00:00:00'
 '2016-10-01 00:00:00' '2016-10-03 00:00:00' '2016-10-04 00:00:00'
 '2016-10-08 00:00:00' '2016-10-10 00:00:00' '2016-10-11 00:00:00'
 '2016-10-13 00:00:00' '2016-10-18 00:00:00' '2016-10-20 00:00:00'
 '2016-10-23 00:00:00' '2016-10-24 00:00:00' '2016-10-26 00:00:00'
 '2016-10-27 00:00:00' '2016-10-29 00:00:00' '2016-11-04 00:00:00'
 '2016-11-08 00:00:00' '2016-11-11 00:00:00' '2016-11-13 00:00:00'
 '2016-11-19 00:00:00' '2016-11-21 00:00:00' '2016-11-23 00:00:00'
 '2016-11-25 00:00:00' '2016-11-28 00:00:00' '2016-11-29 00:00:00'
 '2016-11-30 00:00:00' '2016-12-01 00:00:00' '2016-12-08 00:00:00'
 '2016-12-09 00:00:00' '2016-12-10 00:00:00' '2016-12-12 00:00:00'
 '2016-12-13 00:00:00' '2016-12-15 00:00:00' '2016-12-16 00:00:00'
 '2016-12-19 00:00:00' '2016-12-23 00:00:00' '2016-12-22 00:00:00'
 '2016-12-26 00:00:00' '2016-12-28 00:00:00' '2016-12-30 00:00:00'
 '2016-12-31 00:00:00' '2017-01-02 00:00:00' '2017-01-05 00:00:00'
 '2017-01-06 00:00:00' '2017-01-07 00:00:00' '2017-01-08 00:00:00'
 '2017-01-09 00:00:00' '2017-01-10 00:00:00' '2017-01-12 00:00:00'
 '2017-01-14 00:00:00' '2017-01-17 00:00:00' '2017-01-20 00:00:00'
 '2017-01-21 00:00:00' '2017-01-23 00:00:00' '2017-01-24 00:00:00'
 '2017-01-25 00:00:00' '2017-01-27 00:00:00' '2017-01-29 00:00:00'
 '2017-01-28 00:00:00' '2017-01-31 00:00:00' '2017-02-01 00:00:00'
 '2017-02-04 00:00:00' '2017-02-05 00:00:00' '2017-02-07 00:00:00'
 '2017-02-08 00:00:00' '2017-02-09 00:00:00' '2017-02-13 00:00:00'
 '2017-02-14 00:00:00' '2017-02-15 00:00:00' '2017-02-16 00:00:00'
 '2017-02-17 00:00:00' '2017-02-23 00:00:00' '2017-02-25 00:00:00'
 '2017-02-26 00:00:00' '2017-02-27 00:00:00' '2017-03-01 00:00:00'
 '2017-03-02 00:00:00' '2017-03-04 00:00:00' '2017-03-06 00:00:00'
 '2017-03-08 00:00:00' '2017-03-09 00:00:00' '2017-03-10 00:00:00'
 '2017-03-15 00:00:00' '2017-03-18 00:00:00' '2017-03-22 00:00:00'
 '2017-03-25 00:00:00' '2017-03-31 00:00:00' '2017-04-04 00:00:00'
 '2017-04-05 00:00:00' '2017-04-07 00:00:00' '2017-04-06 00:00:00'
 '2017-04-10 00:00:00' '2017-04-08 00:00:00' '2017-04-11 00:00:00'
 '2017-04-13 00:00:00' '2017-04-12 00:00:00' '2017-04-23 00:00:00'
 '2017-04-19 00:00:00' '2017-04-25 00:00:00' '2017-04-24 00:00:00'
 '2017-04-28 00:00:00' '2017-04-29 00:00:00' '2017-04-30 00:00:00'
 '2017-05-05 00:00:00' '2017-05-06 00:00:00' '2017-05-10 00:00:00'
 '2017-05-16 00:00:00' '2017-05-17 00:00:00' '2017-05-18 00:00:00'
 '2017-05-19 00:00:00' '2017-05-23 00:00:00' '2017-05-30 00:00:00'
 '2017-06-04 00:00:00' '2017-06-09 00:00:00' '2017-06-11 00:00:00'
 '2017-06-14 00:00:00' '2017-06-15 00:00:00' '2017-06-17 00:00:00'
 '2017-06-18 00:00:00' '2017-06-24 00:00:00' '2017-06-20 00:00:00'
 '2017-06-23 00:00:00' '2017-06-19 00:00:00' '2017-06-22 00:00:00'
 '2017-06-29 00:00:00' '2017-07-04 00:00:00' '2017-07-05 00:00:00'
 '2017-07-06 00:00:00' '2017-07-09 00:00:00']


------------------------------------------------------------
Unique values of "Country" column
------------------------------------------------------------
['Country_01' 'Country_02' 'Country_03']


------------------------------------------------------------
Unique values of "Local" column
------------------------------------------------------------
['Local_01' 'Local_02' 'Local_03' 'Local_04' 'Local_05' 'Local_06'
 'Local_07' 'Local_08' 'Local_10' 'Local_09' 'Local_11' 'Local_12']


------------------------------------------------------------
Unique values of "Industry Sector" column
------------------------------------------------------------
['Mining' 'Metals' 'Others']


------------------------------------------------------------
Unique values of "Accident Level" column
------------------------------------------------------------
['I' 'IV' 'III' 'II' 'V']


------------------------------------------------------------
Unique values of "Potential Accident Level" column
------------------------------------------------------------
['IV' 'III' 'I' 'II' 'V' 'VI']


------------------------------------------------------------
Unique values of "Gender" column
------------------------------------------------------------
['Male' 'Female']


------------------------------------------------------------
Unique values of "Employee type" column
------------------------------------------------------------
['Third Party' 'Employee' 'Third Party (Remote)']


------------------------------------------------------------
Unique values of "Critical Risk" column
------------------------------------------------------------
['Pressed' 'Pressurized Systems' 'Manual Tools' 'Others'
 'Fall prevention (same level)' 'Chemical substances' 'Liquid Metal'
 'Electrical installation' 'Confined space'
 'Pressurized Systems / Chemical Substances'
 'Blocking and isolation of energies' 'Suspended Loads' 'Poll' 'Cut'
 'Fall' 'Bees' 'Fall prevention' '\nNot applicable' 'Traffic' 'Projection'
 'Venomous Animals' 'Plates' 'Projection/Burning' 'remains of choco'
 'Vehicles and Mobile Equipment' 'Projection/Choco' 'Machine Protection'
 'Power lock' 'Burn' 'Projection/Manual Tools'
 'Individual protection equipment' 'Electrical Shock'
 'Projection of fragments']


In [53]:
# Check the presence of missing values
# Per-column count of NaN/None entries (all zero here, per the Out cell)
data.isnull().sum()
Out[53]:
Date                        0
Country                     0
Local                       0
Industry Sector             0
Accident Level              0
Potential Accident Level    0
Gender                      0
Employee type               0
Critical Risk               0
Description                 0
dtype: int64
In [54]:
#Data Pre-processing
#Parse the raw date strings once, then derive calendar features with the
#vectorized .dt accessor: this replaces five row-wise .apply(lambda) calls
#(slower) and the deprecated/removed Timestamp.weekofyear attribute.
data['Date'] = pd.to_datetime(data['Date'])

data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day
data['Weekday'] = data['Date'].dt.day_name()
# isocalendar().week is the supported replacement for weekofyear; cast back
# to int64 to keep the original plain-integer dtype (isocalendar uses UInt32).
data['WeekofYear'] = data['Date'].dt.isocalendar().week.astype('int64')

data.head()
Out[54]:
Date Country Local Industry Sector Accident Level Potential Accident Level Gender Employee type Critical Risk Description Year Month Day Weekday WeekofYear
0 2016-01-01 Country_01 Local_01 Mining I IV Male Third Party Pressed While removing the drill rod of the Jumbo 08 f... 2016 1 1 Friday 53
1 2016-01-02 Country_02 Local_02 Mining I IV Male Employee Pressurized Systems During the activation of a sodium sulphide pum... 2016 1 2 Saturday 53
2 2016-01-06 Country_01 Local_03 Mining I III Male Third Party (Remote) Manual Tools In the sub-station MILPO located at level +170... 2016 1 6 Wednesday 1
3 2016-01-08 Country_01 Local_04 Mining I I Male Third Party Others Being 9:45 am. approximately in the Nv. 1880 C... 2016 1 8 Friday 1
4 2016-01-10 Country_01 Local_04 Mining IV IV Male Third Party Others Approximately at 11:45 a.m. in circumstances t... 2016 1 10 Sunday 1
  • Removed 'Unnamed: 0' column and renamed - 'Data', 'Countries', 'Genre', 'Employee or Third Party' columns in the dataset.
  • We had 7 duplicate instances in the dataset and dropped those duplicates.
  • There are no outliers in the dataset.
  • No missing values in dataset.
  • We are left with 418 rows and 10 columns after data cleansing.

EDA

Variable Identification:

  • Target variable: 'Accident Level', 'Potential Accident Level'
  • Predictors (Input varibles): 'Date', 'Country', 'Local', 'Industry Sector', 'Gender', 'Employee type', 'Critical Risk', 'Description'
In [55]:
#Univariate Analysis

print('--'*30); print('Value Counts for `Country` label'); print('--'*30)

# total_row_cnt is reused by later univariate cells — keep it at module level
total_row_cnt = data.shape[0]

# One pass over value_counts() replaces three hand-written per-country boolean
# filters; value_counts() sorts by frequency, which matches the original
# Country_01 / Country_02 / Country_03 print order.
country_counts = data['Country'].value_counts()
for country, cnt in country_counts.items():
    print(f'{country} count: {cnt} i.e. {round(cnt/total_row_cnt*100, 0)}%')

print('--'*30); print('Distributon of `Country` label'); print('--'*30)
# Let the pie chart take its labels from the value_counts index — a hard-coded
# label list silently mislabels slices if the frequency order ever changes.
_ = country_counts.plot(kind = 'pie', autopct = '%.0f%%', figsize = (10, 6))
------------------------------------------------------------
Value Counts for `Country` label
------------------------------------------------------------
Country_01 count: 248 i.e. 59.0%
Country_02 count: 129 i.e. 31.0%
Country_03 count: 41 i.e. 10.0%
------------------------------------------------------------
Distributon of `Country` label
------------------------------------------------------------
  • 59% accidents occurred in Country_01
  • 31% accidents occurred in Country_02
  • 10% accidents occurred in Country_03
In [56]:
# Percentage of accident records per Local (plant site), rounded to whole numbers
local_cnt = np.round(data['Local'].value_counts(normalize=True) * 100)

# Interactive bokeh bar chart of the per-Local distribution
hv.extension('bokeh')
hv.Bars(local_cnt).opts(title="Local Count", color="#8888ff", xlabel="Locals", ylabel="Percentage", yformatter='%d%%')\
                .opts(opts.Bars(width=700, height=300,tools=['hover'],show_grid=True))
Out[56]:
  • Local_03 accounts for the highest share of recorded accidents.
  • Local_09 accounts for the lowest share of recorded accidents.
In [57]:
print('--'*30); print('Value Counts for `Industry Sector` label'); print('--'*30)

# Derive per-sector counts from value_counts() instead of three separate
# boolean filters; descending frequency order matches the original
# Mining / Metals / Others print order.
sector_counts = data['Industry Sector'].value_counts()
for sector, cnt in sector_counts.items():
    print(f'{sector} count: {cnt} i.e. {round(cnt/total_row_cnt*100, 0)}%')

print('--'*30); print('Distributon of `Industry Sector` label'); print('--'*30)

# Percentage share per sector, rendered as an interactive bokeh bar chart
sector_cnt = np.round(data['Industry Sector'].value_counts(normalize=True) * 100)
hv.extension('bokeh')
hv.Bars(sector_cnt).opts(title="Industry Sector Count", color="#8888ff", xlabel="Sectors", ylabel="Percentage", yformatter='%d%%')\
                .opts(opts.Bars(width=500, height=300,tools=['hover'],show_grid=True))
------------------------------------------------------------
Value Counts for `Industry Sector` label
------------------------------------------------------------
Mining count: 237 i.e. 57.0%
Metals count: 134 i.e. 32.0%
Others count: 47 i.e. 11.0%
------------------------------------------------------------
Distributon of `Industry Sector` label
------------------------------------------------------------
Out[57]:
  • 57% of the accidents occurred at plants in the Mining sector.
  • 32% of the accidents occurred at plants in the Metals sector.
  • 11% of the accidents occurred at plants in the Others sector.
In [58]:
# Roman-numeral severity scale shared by both accident columns, in natural
# order (VI never occurs for `Accident Level` but is still reported as 0,
# matching the original output).
severity_levels = ['I', 'II', 'III', 'IV', 'V', 'VI']

def print_level_counts(column, label):
    """Print the row count and percentage share of each severity level of `column`."""
    for level in severity_levels:
        cnt = data[data[column] == level].shape[0]
        print(f'{label} - {level} count: {cnt} i.e. {round(cnt/total_row_cnt*100, 0)}%')

# The two 12-line hand-written stanzas below were collapsed into the helper
# above (one filter + print per level instead of 12 near-identical lines).
print('--'*30); print('Value Counts for `Accident Level` label'); print('--'*40)
print_level_counts('Accident Level', 'Accident Level')

# Fixed the original header string, which was missing its closing backtick.
print('--'*30); print('Value Counts for `Potential Accident Level` label'); print('--'*40)
print_level_counts('Potential Accident Level', 'Potential Accident Level')

print('--'*30); print('Distributon of `Accident Level` & `Potential Accident Level` label'); print('--'*40)

# Side-by-side percentage distribution of the two severity columns: compute
# both shares, align them in one frame, melt to long form for grouped bars.
ac_level_cnt = np.round(data['Accident Level'].value_counts(normalize=True) * 100)
pot_ac_level_cnt = np.round(data['Potential Accident Level'].value_counts(normalize=True) * 100, decimals=1)
ac_pot = pd.concat([ac_level_cnt, pot_ac_level_cnt], axis=1,sort=False).fillna(0).rename(columns={'Accident Level':'Accident', 'Potential Accident Level':'Potential'})
ac_pot = pd.melt(ac_pot.reset_index(), ['index']).rename(columns={'index':'Severity', 'variable':'Levels'})
hv.extension('bokeh')
hv.Bars(ac_pot, ['Severity', 'Levels'], 'value').opts(opts.Bars(title="Accident Levels Count", width=700, height=300,tools=['hover'],\
                                                                show_grid=True,xrotation=45, ylabel="Percentage", yformatter='%d%%'))
------------------------------------------------------------
Value Counts for `Accident Level` label
--------------------------------------------------------------------------------
Accident Level - I count: 309 i.e. 74.0%
Accident Level - II count: 40 i.e. 10.0%
Accident Level - III count: 31 i.e. 7.0%
Accident Level - IV count: 30 i.e. 7.0%
Accident Level - V count: 8 i.e. 2.0%
Accident Level - VI count: 0 i.e. 0.0%
------------------------------------------------------------
Value Counts for `Potential Accident Level
--------------------------------------------------------------------------------
Potential Accident Level - I count: 45 i.e. 11.0%
Potential Accident Level - II count: 95 i.e. 23.0%
Potential Accident Level - III count: 106 i.e. 25.0%
Potential Accident Level - IV count: 141 i.e. 34.0%
Potential Accident Level - V count: 30 i.e. 7.0%
Potential Accident Level - VI count: 1 i.e. 0.0%
------------------------------------------------------------
Distributon of `Accident Level` & `Potential Accident Level` label
--------------------------------------------------------------------------------
Out[58]:
  • The number of accidents decreases as the Accident Level increases.
  • The number of accidents increases with the Potential Accident Level up to level IV (the most common), then drops sharply for levels V and VI.
In [59]:
# Value counts and distribution for the `Gender` column.
divider = '--' * 30
print(divider)
print('Value Counts for `Gender` label')
print(divider)

# Boolean-mask sums give the same row counts as shape[0] on a filtered frame
Male_cnt = int((data['Gender'] == 'Male').sum())
Female_cnt = int((data['Gender'] == 'Female').sum())

for gender_label, cnt in (('Male', Male_cnt), ('Female', Female_cnt)):
    print(f'{gender_label} count: {cnt} i.e. {round(cnt/total_row_cnt*100, 0)}%')

print(divider)
print('Distributon of `Gender` label')
print(divider)

# Percentage share per gender as an interactive bokeh bar chart
gender_cnt = np.round(data['Gender'].value_counts(normalize=True) * 100)
hv.extension('bokeh')
hv.Bars(gender_cnt).opts(title="Gender Count", color="#8888ff", xlabel="Gender", ylabel="Percentage", yformatter='%d%%')\
                .opts(opts.Bars(width=500, height=300,tools=['hover'],show_grid=True))
------------------------------------------------------------
Value Counts for `Gender` label
------------------------------------------------------------
Male count: 396 i.e. 95.0%
Female count: 22 i.e. 5.0%
------------------------------------------------------------
Distributon of `Gender` label
------------------------------------------------------------
Out[59]:
  • There are more men working in this industry as compared to women.
In [60]:
print('--'*30); print('Value Counts for `Employee type` label'); print('--'*30)

# Single pass over value_counts() replaces three hand-written boolean filters;
# descending frequency order matches the original Third Party / Employee /
# Third Party (Remote) print order.
emp_counts = data['Employee type'].value_counts()
for emp_type, cnt in emp_counts.items():
    print(f'{emp_type} count: {cnt} i.e. {round(cnt/total_row_cnt*100, 0)}%')

print('--'*30); print('Distributon of `Employee type` label'); print('--'*30)

# Percentage share per employee type as an interactive bokeh bar chart
emp_type_cnt = np.round(data['Employee type'].value_counts(normalize=True) * 100)
hv.extension('bokeh')
hv.Bars(emp_type_cnt).opts(title="Employee type Count", color="#8888ff", xlabel="Employee Type", ylabel="Percentage", yformatter='%d%%')\
                .opts(opts.Bars(width=500, height=300,tools=['hover'],show_grid=True))
------------------------------------------------------------
Value Counts for `Employee type` label
------------------------------------------------------------
Third Party count: 185 i.e. 44.0%
Employee count: 178 i.e. 43.0%
Third Party (Remote) count: 55 i.e. 13.0%
------------------------------------------------------------
Distributon of `Employee type` label
------------------------------------------------------------
Out[60]:
  • 44% of the accidents involve Third Party employees, 43% the company's own employees, and 13% Third Party (Remote) employees.
In [61]:
# Percentage of accident records per Critical Risk category, rounded
cr_risk_cnt = np.round(data['Critical Risk'].value_counts(normalize=True) * 100)
hv.extension('bokeh')
# [::-1] reverses the order so the most frequent risk ends up at the top of
# the horizontal (invert_axes=True) bar chart
hv.Bars(cr_risk_cnt[::-1]).opts(title="Critical Risk Count", color="#8888ff", xlabel="Critical Risks", ylabel="Percentage", xformatter='%d%%')\
                .opts(opts.Bars(width=600, height=600,tools=['hover'],show_grid=True,invert_axes=True))
Out[61]:
  • Most of the incidents are registered under the catch-all 'Others' category, which makes it time-consuming to analyze the risks and reasons behind the accidents.
In [62]:
#Checking the proportion of Industry sector in different countries via a
#stacked bar chart of the sector-by-country contingency table
indsec_cntry_table = pd.crosstab(index=data['Industry Sector'], columns=data['Country'])
ax = indsec_cntry_table.plot(kind='bar', figsize=(8, 8), stacked=True)
ax.set_title("Proportion of Industry Sector in different countries")
plt.show()
  • Metals and Mining industry sector plants are not available in Country_03.
  • Distribution of industry sector differ significantly in each country.

NLP preprocessing

In [63]:
# Checking 5 random Descriptions and accident_levels from the data where the length of headline is > 100;

indexes = list(data.loc[data['Description'].str.len() > 100, 'Description'].index)
rands = random.sample(indexes, 5)
# NOTE(review): descriptions/accident_levels are sampled but never displayed
# below — consider printing them so the "checking 5 random Descriptions"
# comment matches what the cell actually shows.
descriptions, accident_levels = list(data.loc[rands, 'Description']), list(data.loc[rands, 'Accident Level'])


print('--'*40); print('Distributon of accident_level where the length of Description is > 100'); print('--'*40)
# Take the pie labels from the value_counts index itself: the original
# hard-coded ['I', ..., 'V'] list assigns labels by slice position, which
# mislabels slices whenever the frequency order differs from I..V.
_ = data.loc[indexes, 'Accident Level'].value_counts().plot(kind = 'pie', autopct = '%.0f%%', figsize = (10, 6))
--------------------------------------------------------------------------------
Distributon of accident_level where the length of Description is > 100
--------------------------------------------------------------------------------
  • 74% of the records where the accident description is longer than 100 characters fall under accident level I.
  • From the above random headlines, it appears that the data is mostly lower-cased. Pre-processing such as removing punctuations and lemmatization can be used.
  • There are few alphanumeric characters like 042-TC-06, Nv. 3370, CX 212 captured in description where removing these characters might help.
  • There are digits in the description for e.g. level 326, Dumper 01 where removing the digits wouldn't help.
  • 34% of data where accident description is greater than 100 is captured in potential accident level IV.
In [64]:
# Converting description to lower case; str.lower applied element-wise
data['Cleaned_Description'] = data['Description'].apply(str.lower)
In [65]:
#library that contains punctuation
import string
string.punctuation
Out[65]:
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
In [66]:
# defining the function to remove punctuation
def remove_punctuation(text):
    """Return `text` with every ASCII punctuation character stripped out."""
    # str.translate with a deletion table is the stdlib way to drop characters
    return text.translate(str.maketrans('', '', string.punctuation))
# storing the punctuation-free text (pass the function directly, no lambda needed)
data['Cleaned_Description'] = data['Cleaned_Description'].apply(remove_punctuation)
In [67]:
# Collapse runs of spaces into a single space (vectorized equivalent of re.sub(' +', ' ', x))
data['Cleaned_Description'] = data['Cleaned_Description'].str.replace(' +', ' ', regex=True)
In [68]:
# remove stopwords
from nltk.corpus import stopwords

# English stopwords plus two corpus-specific noise tokens
stop_words = set(stopwords.words('english'))
stop_words.update({'subject', 'http'})


def remove_stopwords(text):
    """Drop every stopword token from `text` and rejoin with single spaces."""
    kept_tokens = [token for token in str(text).split() if token not in stop_words]
    return " ".join(kept_tokens)


data['Cleaned_Description'] = data['Cleaned_Description'].apply(remove_stopwords)
In [69]:
import nltk
nltk.download('wordnet')

# defining the function for lemmatization
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of `text` and rejoin with spaces."""
    return " ".join(lemmatizer.lemmatize(token) for token in text.split())


data['Cleaned_Description'] = data['Cleaned_Description'].apply(lemmatize_words)
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
In [70]:
print('Get the length of each line, find the maximum length and print the maximum length line')
print('Length of line ranges from 64 to 672.')
print('--'*45)

# Character count of each cleaned description
data['line_length'] = data['Cleaned_Description'].str.len()

# idxmax gives the first row holding the maximum, matching the original
# boolean-filter-then-first behavior
longest_line = data.loc[data['line_length'].idxmax(), 'Cleaned_Description']

print(f"Minimum line length: {data['line_length'].min()}")
print(f"Maximum line length: {data['line_length'].max()}")
print(f"Line with maximum length: {longest_line}")
Get the length of each line, find the maximum length and print the maximum length line
Length of line ranges from 64 to 672.
------------------------------------------------------------------------------------------
Minimum line length: 64
Maximum line length: 672
Line with maximum length: level 3490 gallery 239 holding activity bolter equipment 24 operator performs drilling first hole support right gable 7footdeep drill end drill rod break leaving thread inside drilling machine shank operator assistant decide make two empty percussion attempt free thread shank without success third attempt assistant enters 38 corrugated iron central hole rest bar embedded shank generate pressure moment operator activates percussion generates movement shank hit palm victim left hand generating described injury worker wearing safety glove time accident end corrugated iron contact left hand shaped like cane worker time accident positioned roof supported mesh split set
In [71]:
print('Get the number of words, find the maximum number of words and print the maximum number of words')
print('Number of words ranges from 10 to 98.')
print('--'*45)

# Word count per cleaned description. str.split() with no argument splits on
# any run of whitespace and yields [] for an empty string, so blank or
# multi-space descriptions are not over-counted the way split(' ') would be
# (split(' ') counts an empty string as 1 word and each extra space as one more).
data['nb_words'] = data['Cleaned_Description'].str.split().str.len()

print('Minimum number of words: {}'.format(data['nb_words'].min()))
print('Maximum number of words: {}'.format(data['nb_words'].max()))
print('Line with maximum number of words: {}'.format(data[data['nb_words'] == data['nb_words'].max()]['Cleaned_Description'].values[0]))
Get the number of words, find the maximum number of words and print the maximum number of words
Number of words ranges from 10 to 98.
------------------------------------------------------------------------------------------
Minimum number of words: 10
Maximum number of words: 98
Line with maximum number of words: performing sleeve removal maneuver hole 600 meter deep general da silva pressed one side locking nut rod together jack hold entire weight rod maneuver locking procedure effective weight rod secured steel wire rope probe winch moment driller pedro released brake winch inefficacy locking done one side chestnut without aid monkey caused sliding rod auxiliary prepared manual unlocking rod holding faucet key firmly probe tower composition shifted stem slid hand shifted downward causing left hand strike base probe tower structure causing cut 4th 5th quirodactyl employee taken hospital went medical care wound sutured 16 stitch removed 10 day activity
In [72]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Join every cleaned description into one corpus string. The original
# str(desc) converted the pandas Series *repr* to text, so the word cloud was
# built from a truncated preview polluted with index numbers, the ellipsis
# marker, and the "Name:/dtype:" footer instead of the full column contents.
corpus = " ".join(data['Cleaned_Description'].astype(str))

wordcloud = WordCloud(width=1500, height=800, random_state=1,
                      background_color='black', min_font_size=5,
                      max_words=300, collocations=False).generate(corpus)

plt.figure(figsize=(15, 10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
  • Most words are related to maintenance, accidents, employees, equipment, and infrastructure.
In [73]:
# Top-5 models (vectorizer + classifier combinations) for predicting Accident Level
models_list_accidentlevel = [
    "tf_vectorizer with naive_bayes",
    "tf_vectorizer with SVC",
    "cv_vectorizer with KNeighborsClassifier",
    "cv_vectorizer with SVC",
    "tf_vectorizer with KNeighborsClassifier",
]

# Accuracies (in %) obtained by each of the five models above
acc1, acc2, acc3, acc4, acc5 = 80, 80, 80, 78, 78
accuracy_accidentlevel = [acc1, acc2, acc3, acc4, acc5]

# Build the summary table directly from a dict (column order is preserved)
df_acc = pd.DataFrame({
    'models_list_accidentlevel': models_list_accidentlevel,
    'accuracy_accidentlevel': accuracy_accidentlevel,
})
df_acc
Out[73]:
models_list_accidentlevel accuracy_accidentlevel
0 tf_vectorizer with naive_bayes 80
1 tf_vectorizer with SVC 80
2 cv_vectorizer with KNeighborsClassifier 80
3 cv_vectorizer with SVC 78
4 tf_vectorizer with KNeighborsClassifier 78
In [74]:
import matplotlib.pyplot as plt
y=["tf_vectorizer with naive_bayes","tf_vectorizer with SVC","cv_vectorizer with KNeighborsClassifier","cv_vectorizer with SVC",
                   "tf_vectorizer with KNeighborsClassifier"]
 
# getting values against each value of y
x=[acc1, acc2, acc3, acc4, acc5]
plt.barh(y, x)
 
# setting label of y-axis
plt.ylabel("Models to predict Accident level")
 
# setting label of x-axis
plt.xlabel("Accuracy")
plt.title("Comparison of accuracies of Top 5 models for Accident level")
plt.show()

Comparing all of the models evaluated above for the target label Accident Level, LSTM with GloVe embeddings and BiLSTM with GloVe embeddings achieve better accuracies than the others.